#conda install -c plotly plotly
import mysql.connector
import pandas as pd
from mysql.connector import errorcode
import plotly.express as px
Purpose: Study the relationship between global data of stressors in countries that could indicate a population’s prosperity and death by popular diseases
Examples of relationships:
Studying the potential relationships could help a country to better understand how to better deal with popular deadly diseases and in what areas they could improve to lower the death counts
# Read in both data files into pandas for use in plotly
#data = pd.read_csv('combined_data.csv')
#data1 = pd.read_csv('uhc_data.csv')
#open connection to execute an SQL query
try:
conn = mysql.connector.connect(option_files=['connection.conf','password.conf'])
db_cursor = conn.cursor()
#enter query here
query1 = ('''
SELECT
c.`Year`,
c.Country_name,
c.Mean_Annual_Pollution_Exposure_Rank,
c.Mean_School_Year_Rank,
c.GDP_Per_Capita_rank,
rank() over (partition by (u.year) order by u.UHC_Index desc) as UHC_Index_rank
FROM `l02-6`.combined AS c
join
(select Year, Country_Code, UHC_Index
from `l02-6`.uhc) as u
on c.Country_code=u.Country_code and c.Year = u.Year
order by c.Mean_Annual_Pollution_Exposure_rank asc;''')
#copy query result to the new list
data1 = pd.read_sql(query1, conn)
db_cursor.close()
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exist")
else:
print(err)
else:
conn.close()
display(data1.head())
display(data1.tail())
#open connection to execute an SQL query
try:
conn = mysql.connector.connect(option_files=['connection.conf','password.conf'])
db_cursor = conn.cursor()
#enter query here
query3 = ('''
SELECT
pd.Country_Name,
pd.Country_Code,
pd.Year,
pd.Liver_disease,
pd.Digestive_Disease,
pd.HIV_AIDS,
pd.Diarrheal_diseases,
pd.Intestinal_Infectious_Disease,
pd.Lower_Respiratory_Infections,
pd.Meningitis,
pd.Alzheimer_disease_and_other_dementias,
pd.Parkinson_disease,
pd.Malaria,
pd.Maternal_disorders,
pd.Neonatal_disorders,
pd.Nutritional_deficiencies,
pd.Diabetes_mellitus,
pd.Chronic_kidney_disease,
pd.Chronic_respiratory_diseases,
pd.Neoplasms,
pd.Protein_Energy_Malnutrition,
pd.Cardiovascular_Diseases,
pd.Mean_Annual_Pollution_Exposure,
eg.Mean_School_Year,
eg.GDP_Per_Capita,
cume_dist() over (partition by (year) order by pd.liver_disease ) * 100 as Liver_disease_rank,
cume_dist() over (partition by (year) order by pd.Digestive_Disease ) * 100 as Digestive_disease_rank,
cume_dist() over (partition by (year) order by pd.HIV_AIDS ) * 100 as HIV_AIDS_rank,
cume_dist() over (partition by (year) order by pd.Diarrheal_diseases ) * 100 as Diarrheal_disease_rank,
cume_dist() over (partition by (year) order by pd.Intestinal_Infectious_Disease ) * 100 as Intestinal_Infectious_disease_rank,
cume_dist() over (partition by (year) order by pd.Lower_Respiratory_Infections ) * 100 as Lower_Respiratory_Infections_rank,
cume_dist() over (partition by (year) order by pd.Meningitis ) * 100 as Meningitis_rank,
cume_dist() over (partition by (year) order by pd.Alzheimer_disease_and_other_dementias ) * 100 as Alzheimer_disease_and_other_dementias_rank,
cume_dist() over (partition by (year) order by pd.Parkinson_disease ) * 100 as Parkinson_disease_rank,
cume_dist() over (partition by (year) order by pd.Malaria ) * 100 as Malaria_rank,
cume_dist() over (partition by (year) order by pd.Maternal_disorders ) * 100 as Maternal_disorders_rank,
cume_dist() over (partition by (year) order by pd.Neonatal_disorders ) * 100 as Neonatal_disorders_rank,
cume_dist() over (partition by (year) order by pd.Nutritional_deficiencies ) * 100 as Nutritional_deficiencies_rank,
cume_dist() over (partition by (year) order by pd.Diabetes_mellitus ) * 100 as Diabetes_mellitus_rank,
cume_dist() over (partition by (year) order by pd.Chronic_kidney_disease ) * 100 as Chronic_kidney_disease_rank,
cume_dist() over (partition by (year) order by pd.Chronic_respiratory_diseases ) * 100 as Chronic_respiratory_diseases_rank,
cume_dist() over (partition by (year) order by pd.Neoplasms ) * 100 as Neoplasms_rank,
cume_dist() over (partition by (year) order by pd.Protein_Energy_Malnutrition ) * 100 as Protein_Energy_Malnutrition_rank,
cume_dist() over (partition by (year) order by pd.Cardiovascular_Diseases ) * 100 as Cardiovascular_Diseases_rank,
cume_dist() over (partition by (year) order by pd.Mean_Annual_Pollution_Exposure ) * 100 as Mean_Annual_Pollution_Exposure_rank,
cume_dist() over (partition by (year) order by eg.Mean_School_Year ) * 100 as Mean_School_Year_rank,
cume_dist() over (partition by (year) order by eg.GDP_Per_Capita ) * 100 as GDP_Per_Capita_rank
FROM
(SELECT d.*,
p.Mean_Annual_Pollution_Exposure
FROM `l02-6`.death_type AS d JOIN `l02-6`.pollution AS p
ON d.Country_Code = p.Country_Code AND d.Year = p.Year) AS pd
JOIN
(SELECT e.*, g.GDP_Per_Capita
FROM `l02-6`.avg_education AS e JOIN `l02-6`.gdp_data AS g
ON e.Country_Code = g.Country_Code AND e.Year = g.Year) AS eg
ON pd.Country_Code = eg.Country_Code AND pd.Year = eg.Year
order by Year, Mean_Annual_Pollution_Exposure_rank;
''')
#copy query result to the new list
data3 = pd.read_sql(query3, conn)
db_cursor.close()
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exist")
else:
print(err)
else:
conn.close()
display(data3.head())
display(data3.tail())
Or:
#Combining all five datasets
#open connection to execute an SQL query
try:
conn = mysql.connector.connect(option_files=['connection.conf','password.conf'])
db_cursor = conn.cursor()
#enter query here
query4 = ('''SELECT
c.*,
u.UHC_Index,
RANK() OVER (PARTITION BY Year ORDER BY u.UHC_Index DESC) AS UHC_Index_rank
FROM
combined AS c JOIN uhc AS u USING(Country_Code, Year);
''')
#get query results
data4 = pd.read_sql(query4, conn)
db_cursor.close()
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exist")
else:
print(err)
else:
conn.close()
display(data4.head())
display(data4.tail())
"Combined" dataset: A result of Albert's second query. Contains the ranking of deaths by diseases, pollution, education, and gdp per capita.
First Query:
Second Query:
#open connection to execute an SQL query
try:
conn = mysql.connector.connect(option_files=['connection.conf','password.conf'])
# Query to do an approximate join between the table that contains everything except the UHC
# dataset, and the UHC dataset for the year 2010.
query5 = ('''
select
u.Year as corresponding_uhc_year,
u.uhc_index_rank,
comb.*
from
(select c.* from combined c join
(select Country_Code, Country_Name, group_concat(Year order by abs(Year-2010)) nearest2010 from
combined group by Country_Name) t2
on c.Country_Name=t2.Country_Name and c.Year=t2.nearest2010
) comb,
(select
Country_Code, Country_Name, Year,
ntile(200) over (partition by Year order by uhc_index) as uhc_index_rank
from uhc where year=2010) u
where
comb.Country_Code=u.Country_Code;
''')
# Query to find the average of pollution, education, and gdp for each country in the first
# and last year that they are present in the dataset that contains everything except
# the UHC dataset
query6 = ('''
select
first.Country_Name, first.Year firstYear,
((184-first.mean_annual_pollution_exposure_rank) + first.mean_school_year_rank +
first.gdp_per_capita_rank)/3 first_avg,
last.Year lastYear,
((184-last.mean_annual_pollution_exposure_rank) + last.mean_school_year_rank +
last.gdp_per_capita_rank)/3 last_avg
from
(select c.* from combined c join
(select Country_Name, min(Year) minyear from combined group by Country_Name) t2
on c.Country_Name = t2.Country_Name and c.Year = t2.minyear
) first,
(select c.* from combined c join
(select Country_Name, max(Year) maxyear from combined group by Country_Name) t2
on c.Country_Name = t2.Country_Name and c.Year = t2.maxyear
) last
where
first.Country_Code = last.Country_Code order by first.Country_Code;
''')
data5 = pd.read_sql(query5, conn)
data6 = pd.read_sql(query6, conn)
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exist")
else:
print(err)
else:
conn.close()
# Results of the first query.
display(data5.set_index("Country_Name").head())
display(data5.set_index("Country_Name").tail())
# Results of the second query.
data6.sort_values(by="last_avg", axis=0, inplace=True)
# Get the top 10 and bottom 10.
data_top = data6.iloc[:10]
data_bottom = data6.iloc[-10:]
# Melt the dataframes, for plotting purposes.
df_top = pd.melt(data_top.loc[:, ["Country_Name", "first_avg", "last_avg"]],
id_vars=["Country_Name"], var_name="Year",
value_name="value")
df_bottom = pd.melt(data_bottom.loc[:, ["Country_Name", "first_avg", "last_avg"]],
id_vars=["Country_Name"], var_name="Year",
value_name="value")
# Sort them, again for plotting purposes.
df_top.sort_values(by=["Year", "value"], ascending=[True, True], axis=0, inplace=True)
df_bottom.sort_values(by=["Year", "value"], ascending=[True, True], axis=0, inplace=True)
fig_top = px.bar(df_top, x="Country_Name", y="value", color="Year", barmode="group",
title="Top 10 Countries In Their First Year",
labels={"Country_Name": "Country Name",
"value":"Average Rank (out of 183)"})
fig_bottom = px.bar(df_bottom, x="Country_Name", y="value", color="Year", barmode="group",
title="Bottom 10 Countries In Their First Year",
labels={"Country_Name": "Country Name",
"value":"Average Rank (out of 183)"})
fig_top.show()
fig_bottom.show()
Computing ranks across UHC index and total death count for the years 1990 and 2010 for all countries
How do the two variables relate to each other?
#open connection to execute an SQL query
try:
conn = mysql.connector.connect(option_files=['connection.conf','password.conf'])
db_cursor = conn.cursor()
#enter query here
query7 = ('''SELECT
u.Country_Name,
u.Country_Code,
u.Year,
u.UHC_Index,
(pd.Liver_disease +
pd.Digestive_Disease +
pd.HIV_AIDS +
pd.Diarrheal_diseases +
pd.Intestinal_Infectious_Disease +
pd.Lower_Respiratory_Infections +
pd.Meningitis +
pd.Alzheimer_disease_and_other_dementias +
pd.Parkinson_disease +
pd.Malaria +
pd.Maternal_disorders +
pd.Neonatal_disorders +
pd.Nutritional_deficiencies +
pd.Diabetes_mellitus +
pd.Chronic_kidney_disease +
pd.Chronic_respiratory_diseases +
pd.Neoplasms +
pd.Protein_Energy_Malnutrition +
pd.Cardiovascular_Diseases) AS Total_Death,
RANK() OVER (PARTITION BY Year ORDER BY u.UHC_Index DESC) AS UHC_Index_Rank,
RANK() OVER (PARTITION BY Year ORDER BY Total_Death DESC) AS Total_Death_Rank
FROM
death_type AS pd JOIN uhc AS u USING(Country_Code, Year);''')
#copy query result to the new list
data7 = pd.read_sql(query7, conn)
db_cursor.close()
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exist")
else:
print(err)
else:
conn.close()
display(data7.head())
display(data7.tail())
fig1 = px.scatter(data2,x='Mean_Annual_Pollution_Exposure_rank',y='Meningitis_rank',trendline = 'lowess')
fig1.data[1].update(line_color='red')
fig1.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig2 = px.scatter(data2,x='GDP_Per_Capita_rank',y='Meningitis_rank',trendline = 'lowess')
fig2.data[1].update(line_color='red')
fig2.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig3 = px.scatter(data2,x='Mean_School_Year_rank',y='Meningitis_rank',trendline = 'lowess')
fig3.data[1].update(line_color='red')
fig3.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig1.show()
fig2.show()
fig3.show()
fig4 = px.scatter(data2,x='Mean_Annual_Pollution_Exposure_rank',y='Diabetes_mellitus_rank',trendline = 'lowess')
fig4.data[1].update(line_color='red')
fig4.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig5 = px.scatter(data2,x='GDP_Per_Capita_rank',y='Diabetes_mellitus_rank',trendline = 'lowess')
fig5.data[1].update(line_color='red')
fig5.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig6 = px.scatter(data2,x='Mean_School_Year_rank',y='Diabetes_mellitus_rank',trendline = 'lowess')
fig6.data[1].update(line_color='red')
fig6.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig4.show()
fig5.show()
fig6.show()
fig7 = px.scatter(data2,x='Mean_Annual_Pollution_Exposure_rank',y='Neoplasms_rank',trendline = 'lowess')
fig7.data[1].update(line_color='red')
fig7.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig8 = px.scatter(data2,x='GDP_Per_Capita_rank',y='Neoplasms_rank',trendline = 'lowess')
fig8.data[1].update(line_color='red')
fig8.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig9 = px.scatter(data2,x='Mean_School_Year_rank',y='Neoplasms_rank',trendline = 'lowess')
fig9.data[1].update(line_color='red')
fig9.update_layout(autosize=False,width=1000,height=500,margin
=dict(l=50,r=50,b=100,t=100,pad=4),paper_bgcolor="LightSteelBlue",)
fig7.show()
fig8.show()
fig9.show()
Look at all relationships and categorize them into which diseases relate to which predictors to find out an overall picture
Add more factors within countries that describe other areas of a countries economic, cultural or educational standing
Population is a key factor not studied - Countries like India and China with large populations may end up skewing the data
References
Canadian Cancer Society, 2020. How Cancer Grows. [image] Available at: https://www.cancer.ca/en/cancer-information/cancer-101/what-is-cancer/how-cancer-starts-grows-and-spreads/?region=on [Accessed 2 December 2020].
Diabetes Library, 2020. How Does Insulin Work?. [image] Available at: https://diabeteslibrary.org/insulin-vs-glucagon/ [Accessed 2 December 2020].
Medical Stocks, 2020. Meninges Medical 3D Illustration. [image] Available at: https://www.dreamstime.com/meninges-medical-d-illustration-white-background-eps-cross-view-human-brain-meninges-medical-d-illustration-white-image142325632 [Accessed 2 December 2020].